Android 15 线程挂起超时崩溃与修复
本文作者
作者:巴黎没有摩天轮Li
链接:
https://juejin.cn/post/7390341683601014824
本文由作者授权发布。
背景
Android 线程挂起超时崩溃与修复
https://juejin.cn/post/7364409181053206554
Android 线程挂起超时崩溃与修复 - 续集
https://juejin.cn/post/7379060488351399946
Android Native 线程挂起流程
https://juejin.cn/post/7372572344248516635
由于前因后果已经在前三篇文章中详细梳理过了,本篇直接上方案。
估计 Google 也觉得线程挂起流程有可优化的地方,此次挂起流程的调整相对比较大,有兴趣可以在 thread_list.cc 中查看详细的变更点。
https://cs.android.com/android/_/android/platform/art/+/d00d24530a29b684bec9a895c1da491a6390395f:runtime/thread_list.cc;dlc=7b7adc7f774f1237950ee9a5b9e3d2afbd8300d9
SuspendThreadByPeer & SuspendThreadByThreadId
我们之前的hook手段都是围绕这两个函数做文章的,这回在 Android 15中,挂起逻辑被整合到SuspendThread 函数中了,我们看下具体的挂起实现。
https://cs.android.com/android/platform/superproject/main/+/main:art/runtime/thread_list.cc;drc=934ce638055e09afd43ec344a2bdf8060fb91978;bpv=1;bpt=1;l=1045?q=thread_list.cc&gsn=SuspendThread&gs=KYTHE%3A%2F%2Fkythe%3A%2F%2Fandroid.googlesource.com%2Fplatform%2Fsuperproject%2Fmain%2F%2Fmain%3Flang%3Dc%252B%252B%3Fpath%3Dart%2Fruntime%2Fthread_list.cc%23Gx1QFEjlvX8eapqRF26zlSvkbEKo9c2evLMrn3Hb3s0
SuspendThread
// Suspends `thread` on behalf of `self`.
// Returns true once the target is suspended. Returns false if the target exited while we were
// retrying, or if attempt_of_4 is 1..3 and the barrier wait timed out (the suspend request is
// then withdrawn). For any other attempt_of_4 value, a timeout with a still non-zero barrier
// ends in thread->AbortInThis(message).
bool ThreadList::SuspendThread(Thread* self,
Thread* thread,
SuspendReason reason,
ThreadState self_state,
const char* func_name,
int attempt_of_4) {
bool is_suspended = false;
VLOG(threads) << func_name << "starting";
pid_t tid = thread->GetTid();
// Snapshots of the target's counters, reported as deltas in the timeout message below.
uint8_t suspended_count;
uint8_t checkpoint_count;
WrappedSuspend1Barrier wrapped_barrier{}; // Suspend barrier.
static_assert(sizeof wrapped_barrier.barrier_ == sizeof(uint32_t));
ThreadExitFlag tef;
bool exited = false;
thread->NotifyOnThreadExit(&tef);
int iter_count = 1;
do {
{
Locks::mutator_lock_->AssertSharedHeld(self);
Locks::thread_list_lock_->AssertHeld(self);
// Note: this will transition to runnable and potentially suspend.
DCHECK(Contains(thread));
// This implementation fails if thread == self. Let the clients handle that case
// appropriately.
CHECK_NE(thread, self) << func_name << "(self)";
VLOG(threads) << func_name << " suspending: " << *thread;
{
MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
if (LIKELY(self->GetSuspendCount() == 0)) {
suspended_count = thread->suspended_count_;
checkpoint_count = thread->checkpoint_count_;
// As before: request suspension by incrementing the target's suspend count, passing our barrier.
thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason);
if (thread->IsSuspended()) {
// The target is already suspended: take our barrier off again.
// See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here.
thread->RemoveFirstSuspend1Barrier(&wrapped_barrier);
if (!thread->HasActiveSuspendBarrier()) {
thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
}
// Record success; the barrier wait further down is skipped.
is_suspended = true;
}
DCHECK_GT(thread->GetSuspendCount(), 0);
break;
}
}
}
// All locks are released, and we should quickly exit the suspend-unfriendly state. Retry.
if (iter_count >= kMaxSuspendRetries) {
LOG(FATAL) << "Too many suspend retries";
}
Locks::thread_list_lock_->ExclusiveUnlock(self);
{
ScopedThreadSuspension sts(self, ThreadState::kSuspended);
usleep(kThreadSuspendSleepUs);
++iter_count;
}
Locks::thread_list_lock_->ExclusiveLock(self);
exited = tef.HasExited();
} while (!exited);
thread->UnregisterThreadExitFlag(&tef);
Locks::thread_list_lock_->ExclusiveUnlock(self);
self->TransitionFromRunnableToSuspended(self_state);
if (exited) {
return false;
}
// Now wait for target to decrement suspend barrier.
std::optional<std::string> failure_info;
if (!is_suspended) {
// Not suspended yet: run the timed wait, handing over the counter inside wrapped_barrier.
failure_info = WaitForSuspendBarrier(&wrapped_barrier.barrier_, tid, attempt_of_4);
if (!failure_info.has_value()) {
// No failure info means the wait observed the barrier at 0 before giving up.
is_suspended = true;
}
}
while (!is_suspended) {
// The wait timed out: either withdraw the request (attempts 1..3) or build a report and abort.
if (attempt_of_4 > 0 && attempt_of_4 < 4) {
MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
if (wrapped_barrier.barrier_.load() == 0) {
// Re-read the barrier's atomic integer: 0 means the target suspended after all.
// Succeeded in the meantime.
is_suspended = true;
continue;
}
// Remove our barrier and undo the suspend request before reporting failure.
thread->RemoveSuspend1Barrier(&wrapped_barrier);
if (!thread->HasActiveSuspendBarrier()) {
thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
}
thread->DecrementSuspendCount(self,
/*for_user_code=*/(reason == SuspendReason::kForUserCode));
Thread::resume_cond_->Broadcast(self);
return false;
}
std::string name;
thread->GetThreadName(name);
WrappedSuspend1Barrier* first_barrier;
{
MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
first_barrier = thread->tlsPtr_.active_suspend1_barriers;
}
// Build the timeout message. This is the key part: note the argument order, with
// &wrapped_barrier as the seventh variadic argument.
std::string message = StringPrintf(
"%s timed out: %d (%s), state&flags: 0x%x, priority: %d,"
" barriers: %p, ours: %p, barrier value: %d, nsusps: %d, ncheckpts: %d, thread_info: %s",
func_name,
thread->GetTid(),
name.c_str(),
thread->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
thread->GetNativePriority(),
first_barrier,
&wrapped_barrier,
wrapped_barrier.barrier_.load(),
thread->suspended_count_ - suspended_count,
thread->checkpoint_count_ - checkpoint_count,
failure_info.value().c_str());
if (wrapped_barrier.barrier_.load() != 0) {
// Barrier is still non-zero after building the message: trigger the crash.
thread->AbortInThis(message);
UNREACHABLE();
}
is_suspended = true;
}
// ignore ...
return true;
}
WaitForSuspendBarrier
// Waits for *barrier to drop to 0 using repeated timed waits, giving up after
// kSuspendBarrierIters of them have timed out.
// Returns std::nullopt as soon as the barrier value is observed to be <= 0. On timeout returns a
// diagnostic string, which is empty unless collect_state is true (t != 0 and attempt_of_4 is 0 or 4).
std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier,
pid_t t,
int attempt_of_4) {
// NOTE(review): start_time is only declared when ART_USE_FUTEXES is set, yet the final return
// expression uses it unconditionally — confirm how non-futex builds compile.
#if ART_USE_FUTEXES
const uint64_t start_time = NanoTime();
#endif
// attempt_of_4 == 0 gets the full timeout; every other attempt gets a quarter of it.
uint64_t timeout_ns =
attempt_of_4 == 0 ? thread_suspend_timeout_ns_ : thread_suspend_timeout_ns_ / 4;
if (attempt_of_4 != 1 && getpriority(PRIO_PROCESS, 0 /* this thread */) > 0) {
// We are a low-priority thread and therefore have a longer ANR timeout. Double the suspend timeout.
// To avoid the getpriority system call in the common case, the first of the four waits is not
// doubled; the third one is tripled instead to compensate.
if (attempt_of_4 == 3) {
timeout_ns *= 3;
} else {
timeout_ns *= 2;
}
}
bool collect_state = (t != 0 && (attempt_of_4 == 0 || attempt_of_4 == 4));
int32_t cur_val = barrier->load(std::memory_order_acquire);
if (cur_val <= 0) {
DCHECK_EQ(cur_val, 0);
return std::nullopt;
}
// i counts the waits that ended in a timeout.
unsigned i = 0;
if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
i = 1;
}
cur_val = barrier->load(std::memory_order_acquire);
if (cur_val <= 0) {
DCHECK_EQ(cur_val, 0);
return std::nullopt;
}
// Long wait; gather information in case of timeout.
std::string sampled_state = collect_state ? GetOsThreadStatQuick(t) : "";
while (i < kSuspendBarrierIters) {
if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
++i;
#if ART_USE_FUTEXES
if (!kShortSuspendTimeouts) {
CHECK_GE(NanoTime() - start_time, i * timeout_ns / kSuspendBarrierIters - 1'000'000);
}
#endif
}
cur_val = barrier->load(std::memory_order_acquire);
if (cur_val <= 0) {
DCHECK_EQ(cur_val, 0);
return std::nullopt;
}
}
return collect_state ? "Target states: [" + sampled_state + ", " + GetOsThreadStatQuick(t) + "]" +
std::to_string(cur_val) + "@" + std::to_string((uintptr_t)barrier) +
" Final wait time: " + PrettyDuration(NanoTime() - start_time) :
"";
}
// Compile-time switch selecting shortened suspend timeouts; disabled here.
static constexpr bool kShortSuspendTimeouts = false;
// Number of timed waits performed on a suspend barrier before giving up:
// 5 when kShortSuspendTimeouts is true, otherwise 20.
static constexpr unsigned kSuspendBarrierIters = kShortSuspendTimeouts ? 5 : 20;
上面的代码中 kShortSuspendTimeouts 为 false,因此 kSuspendBarrierIters 的值为 20;如果把 kShortSuspendTimeouts 改为 true(使用更短的超时),kSuspendBarrierIters 的值则为 5。
WaitOnceForSuspendBarrier
// Performs one timed futex wait on the suspend barrier while it still holds cur_val.
// The wait lasts timeout_ns / kSuspendBarrierIters, converted to milliseconds.
// Returns true if it timed out; returns false when the wait ended for any other
// non-fatal reason (woken up, EAGAIN or EINTR).
static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
                                      int32_t cur_val,
                                      uint64_t timeout_ns) {
  // With short timeouts enabled the caller-supplied timeout is overridden.
  if (kShortSuspendTimeouts) {
    timeout_ns = MsToNs(kSuspendBarrierIters);
  }
  // Length of this single wait, in milliseconds.
  const auto slice_ms = NsToMs(timeout_ns / kSuspendBarrierIters);
  if (kShortSuspendTimeouts) {
    CHECK_GE(slice_ms, 1ul);
  } else {
    DCHECK_GE(slice_ms, 10ul);
  }

  timespec wait_timeout;
  InitTimeSpec(false, CLOCK_MONOTONIC, slice_ms, 0, &wait_timeout);

  // FUTEX_WAIT_PRIVATE: wait on the barrier's address as long as it still contains cur_val.
  const bool wait_returned_ok =
      futex(barrier->Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) == 0;
  if (wait_returned_ok) {
    return false;
  }
  if (errno == ETIMEDOUT) {
    return true;
  }
  if (errno != EAGAIN && errno != EINTR) {
    // Any other error is unexpected: log it and terminate.
    PLOG(FATAL) << "futex wait for suspend barrier failed";
  }
  return false;
}
也就是说,单次 futex 等待的超时时间为 timeout_ns / kSuspendBarrierIters,20 次迭代 × 单次 futex 等待超时时间 = 最大挂起等待时间(即 timeout_ns)。
ok,到了这里我们就知道新版本的挂起超时检测机制是什么了。
Android 15 的思考方案
https://juejin.cn/post/7379060488351399946#heading-6
// Mangled symbol name of android::base::StringPrintf(const char*, ...).
#define SYMBOL_STRING_PRINTF "_ZN7android4base12StringPrintfEPKcz"

// Returns the symbol name handed to the hooking library when looking up StringPrintf.
const char *getStringPrintfFunctionName() {
    const char *symbolName = SYMBOL_STRING_PRINTF;
    return symbolName;
}
// First attempt: hook StringPrintf through an inline hook (BaseInlineHook).
// NOTE(review): this snippet is abbreviated — the namespace is not closed here and the proxy body
// is a placeholder.
namespace hookThreadSuspendAbortV15 {
// JNI global reference to the Java callback; assigned in fixNativeThreadSuspend.
jobject callbackObj = nullptr;
// Out-parameter handed to setupHook; presumably receives the original function's address — confirm.
void *originalStringPrintf = nullptr;
// Function-pointer type used to call through originalStringPrintf.
typedef void *(*StringPrintf_t)(const char *format, ...);
// Declared here; no definition appears inside this snippet.
bool checkFormat(const char *format);
// Replacement installed in place of StringPrintf.
// Placeholder only: `originCallback` is not declared anywhere in this snippet.
void *proxyStringPrintfFunc(const char *format, ...) {
// todo
return originCallback;
}
// Stores a global reference to `callback` and installs proxyStringPrintfFunc as the hook for the
// symbol returned by getStringPrintfFunctionName() in TARGET_LIB_BASE.
void fixNativeThreadSuspend(JNIEnv *env, jobject callback) {
BaseInlineHook baseInlineHook = BaseInlineHook(env);
baseInlineHook.callbackObj = env->NewGlobalRef(callback);
// Keep the same global reference reachable from the proxy function.
callbackObj = baseInlineHook.callbackObj;
baseInlineHook.setupHook(TARGET_LIB_BASE,
getStringPrintfFunctionName(),
(void *) proxyStringPrintfFunc,
(void **) &originalStringPrintf);
}
得意洋洋地打开 Android 15 虚拟机,模拟了线程挂起超时的崩溃,但是回调函数没有被调用,百思不得其解,于是也去 ShadowHook 上创建了一个讨论。
https://github.com/bytedance/android-inline-hook/discussions/70
理论上,我理解使用 inline-hook 去 hook libart.so 中的函数A, 函数A 间接调用了 libbase.so 中的函数B , 我直接 hook 这个 B 函数,自定义一个 Proxy B,当A调用B时候,这个 Proxy B 理论上也会被调用。
所以又集成了 BHook,果然成功了,也许使用修改 GOT 表中的目标地址更加直接一点,不过 inline-hook 为什么不可以还需要再研究一下。
https://github.com/bytedance/bhook
2024.07.16 更新
方案实现
namespace hookThreadSuspendAbortV15 {
jobject callbackObj = nullptr;
void *stubFunction = nullptr;
bool checkFormat(const char *format);
// bytehook proxy installed in place of StringPrintf for calls made from the ART library.
//
// When `format` is the suspend-timeout message (see checkFormat), the seventh variadic argument
// is the address of the caller's WrappedSuspend1Barrier. Instead of formatting the abort message
// right away, this proxy polls that barrier every 10 ms until it reads 0, then reports the time
// spent waiting to the Java callback and returns a short replacement message.
// Every other format string is formatted normally through base::StringAppendV.
//
// NOTE(review): the polling loop has no upper bound; it only ends when the barrier reaches 0.
std::string proxyStringPrintfFunc(const char *format, ...) {
    BYTEHOOK_STACK_SCOPE();
    if (checkFormat(format)) {
        using namespace kbArt;
        // Pull out everything we need first so that va_end runs on every path; the early returns
        // below previously left the function without calling va_end.
        va_list args;
        va_start(args, format);
        const char *func_name = va_arg(args, const char*); // func_name
        (void) va_arg(args, int);                           // tid
        (void) va_arg(args, const char*);                   // name.c_str()
        (void) va_arg(args, int);                           // state_and_flags
        (void) va_arg(args, int);                           // native_priority
        (void) va_arg(args, void*);                         // first_barrier
        WrappedSuspend1Barrier *wrappedBarrier = va_arg(args, WrappedSuspend1Barrier*);
        va_end(args);

        if (wrappedBarrier != nullptr) {
            if (wrappedBarrier->barrier_.load(std::memory_order_acquire) == 0) {
                // Already at 0: nothing to wait for.
                return base::StringPrintf("thread has been suspend : %s", func_name);
            }
            struct timespec startTime{};
            clock_gettime(CLOCK_MONOTONIC, &startTime);
            // Poll interval: 10 ms.
            struct timespec ts{};
            ts.tv_sec = 0;
            ts.tv_nsec = 10000000;
            while (true) {
                if (wrappedBarrier->barrier_.load(std::memory_order_acquire) == 0) {
                    struct timespec endTime{};
                    clock_gettime(CLOCK_MONOTONIC, &endTime);
                    // Elapsed time in seconds.
                    double waitDuration = (endTime.tv_sec - startTime.tv_sec) + (endTime.tv_nsec - startTime.tv_nsec) / 1e9;
                    // Report the wait rounded to three decimal places.
                    NotifyHandleThreadSuspendTimeout::triggerSuspendTimeout(callbackObj, std::round(waitDuration * 1000) / 1000);
                    return base::StringPrintf("thread has been suspend : %s, cost time %f", func_name, waitDuration);
                }
                nanosleep(&ts, nullptr);
            }
        }
    }
    // Not the timeout message (or no barrier pointer): behave like the original StringPrintf.
    va_list ap;
    va_start(ap, format);
    std::string result;
    base::StringAppendV(&result, format, ap);
    va_end(ap);
    return result;
}
// Installs (or re-installs) the bytehook hook that redirects StringPrintf calls made from
// TARGET_ART_LIB to proxyStringPrintfFunc, and remembers `callback` as a JNI global reference.
//
// @param env      JNI environment of the calling thread.
// @param callback Java object notified when a suspend timeout is intercepted.
//
// Safe to call repeatedly: an existing hook is removed first and the global reference created by
// an earlier call is released instead of being leaked.
// NOTE(review): a proxy invocation already in flight on another thread may still hold the old
// reference while it is being released — confirm whether callers can race with this function.
void fixNativeThreadSuspend(JNIEnv *env, jobject callback) {
    // Drop the old hook before swapping the callback so no new proxy calls start in between.
    if (stubFunction != nullptr) {
        bytehook_unhook(stubFunction);
        stubFunction = nullptr;
    }

    jobject previousCallback = callbackObj;
    callbackObj = env->NewGlobalRef(callback);
    if (previousCallback != nullptr) {
        env->DeleteGlobalRef(previousCallback);
    }

    stubFunction = bytehook_hook_single(TARGET_ART_LIB,
                                        nullptr,
                                        getStringPrintfFunctionName(),
                                        reinterpret_cast<void *>(proxyStringPrintfFunc),
                                        nullptr,
                                        nullptr);
    if (stubFunction != nullptr) {
        __android_log_print(ANDROID_LOG_INFO, LOG_TAG_THREAD_SUSPEND_HOOK, "Hook setup success");
    } else {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG_THREAD_SUSPEND_HOOK, "Hook setup failed");
    }
}
// Tells whether `format` is the suspend-timeout message format.
// Returns true only if `format` is non-null and contains every marker substring listed below;
// a null `format` yields false instead of being passed to strstr.
bool checkFormat(const char *format) {
    if (format == nullptr) {
        return false;
    }
    static constexpr const char *kMarkers[] = {
        "timed out",
        "state&flags",
        "priority",
        "barriers",
        "ours",
        "barrier value",
        "nsusps",
        "ncheckpts",
        "thread_info",
    };
    for (const char *marker : kMarkers) {
        if (strstr(format, marker) == nullptr) {
            return false;
        }
    }
    return true;
}
}
// Inline-hook proxy for StringPrintf (first attempt).
//
// When `format` is the suspend-timeout message (see checkFormat) and the caller's barrier is
// still non-zero, notifies the Java callback and forces the barrier to 0 so that the caller's
// "barrier != 0" check no longer leads to abort(). The call is then passed on to the original
// function.
//
// Fix: the barrier address is the SEVENTH variadic argument of the timeout message (after
// func_name, tid, thread name, state&flags, priority and first_barrier). Reading it from the first
// slot returned func_name instead, and the store below then wrote into that string.
//
// NOTE(review): two problems remain and need a decision rather than a local patch:
//  - `args` (a va_list) is passed as an ordinary variadic argument to the original function, so
//    the original never sees the caller's real arguments;
//  - StringPrintf_t is declared as returning void*, while StringPrintf's result is assigned to a
//    std::string at the call site.
void *proxyStringPrintfFunc(const char *format, ...) {
    va_list args;
    va_start(args, format);
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG_THREAD_SUSPEND_HOOK_V15, "hit the hook point.");
    if (checkFormat(format)) {
        using namespace kbArt;
        // Inspect the arguments on a copy so `args` itself is left untouched.
        va_list peek;
        va_copy(peek, args);
        (void) va_arg(peek, const char*); // func_name
        (void) va_arg(peek, int);         // tid
        (void) va_arg(peek, const char*); // name.c_str()
        (void) va_arg(peek, int);         // state_and_flags
        (void) va_arg(peek, int);         // native_priority
        (void) va_arg(peek, void*);       // first_barrier
        WrappedSuspend1Barrier *wrapped_barrier = va_arg(peek, WrappedSuspend1Barrier*);
        va_end(peek);
        if (wrapped_barrier != nullptr && wrapped_barrier->barrier_.load() != 0) {
            if (callbackObj != nullptr) {
                // call the Java callback function.
                NotifyHandleThreadSuspendTimeout::triggerSuspendTimeout(callbackObj);
            }
            // set the barrier to 0 to avoid the abort()...
            wrapped_barrier->barrier_.store(0);
            __android_log_print(ANDROID_LOG_ERROR, LOG_TAG_THREAD_SUSPEND_HOOK_V15, "set 0");
        }
    }
    void *originCallback = ((StringPrintf_t) originalStringPrintf)(format, args);
    va_end(args);
    return originCallback;
}
小知识点
可变参数的读取。
// Reading variadic arguments in order (belongs inside the variadic proxy function).
va_list args;
va_start(args, format);
va_arg(args, const char*); // func_name
va_arg(args, int); // tid
va_arg(args, const char*); // name.c_str()
va_arg(args, int); // state_and_flags
va_arg(args, int); // native_priority
va_arg(args, void*); // first_barrier
// Each earlier argument has to be consumed, even if unused, before the next one can be read.
WrappedSuspend1Barrier *wrapped_barrier = va_arg(args, WrappedSuspend1Barrier*);
// Do not forget to finish with va_end; it is a macro call that takes the va_list.
va_end(args);
保持内存布局一致。
// Local mirror of the WrappedSuspend1Barrier struct; only for Android 15+.
// The hook reads a pointer to the runtime's own instance through this type, so the members
// are kept in this exact order and with these exact types.
struct WrappedSuspend1Barrier {
    // TODO(b/23668816): At least weaken CHECKs to DCHECKs once the bug is fixed.
    static constexpr int kMagic = 0xba8;

    // Set to kMagic on construction.
    int magic_;
    // Starts at 1; the hook treats a value of 0 as "target thread has suspended".
    std::atomic<int32_t> barrier_;
    // Link to the next barrier; null on construction.
    WrappedSuspend1Barrier *next_;

    WrappedSuspend1Barrier() : magic_{kMagic}, barrier_{1}, next_{nullptr} {}
};
修复效果
Android 15 修复前崩溃时日志:
Android 15 修复后崩溃时日志:
日志中打印的是:从原本要触发 abort() 的时刻起,到线程真正挂起为止这段时间的耗时。
最后推荐一下我做的网站,玩Android: wanandroid.com ,包含详尽的知识体系、好用的工具,还有本公众号文章合集,欢迎体验和收藏!
推荐阅读:
扫一扫 关注我的公众号
如果你想要跟大家分享你的文章,欢迎投稿~
┏(^0^)┛明天见!